Load in packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)

library(tmap)
library(sf)
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
set.seed(123)

Load in data to use

bangladesh_gw <- read_csv("https://stahlm.github.io/ENS_215/Data/NationalSurveyData_DPHE_BGS_LabData.csv")
## Rows: 3534 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): SAMPLE_ID, WELL_TYPE, DIVISION, DISTRICT, THANA, UNION, MOUZA
## dbl (24): LAT_DEG, LONG_DEG, YEAR_CONSTRUCTION, WELL_DEPTH_m, As_ugL, Al_mgL...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Prepare data for clustering

Select variables to use in clustering

table_cluster <- bangladesh_gw %>% 
  select(SAMPLE_ID,
         Ba_mgL, 
         Ca_mgL, 
         Fe_mgL, 
         Mn_mgL,
         K__mgL, 
         Mg_mgL, 
         Na_mgL, 
         Si_mgL, 
         SO4_mgL, 
         Sr_mgL) %>% 
  
  drop_na() # need to drop any samples with NA values since can't cluster sites with missing data

Name the rows with the SAMPLE_ID variable

table_cluster <- column_to_rownames(table_cluster, var = "SAMPLE_ID")

Rescale the data

table_cluster_scaled <- scale(table_cluster)

PCA

res.pca <- PCA(table_cluster_scaled,  graph = FALSE)
# Control variable colors using their contributions
fviz_pca_var(res.pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
             )

Cluster the data

fviz_nbclust(
  table_cluster_scaled,
  pam,
  k.max = 10,
  method = "wss",
  diss = get_dist(table_cluster, method = "spearman")
  #diss = get_dist(table_cluster, method = "euclidean")
) 

n_clust = 6
#k_n <- kmeans(table_cluster_scaled, centers = n_clust, nstart = 100, iter.max = 15000)


k_n = cluster::pam(table_cluster_scaled, k = n_clust)
fig_cluster <- fviz_cluster(k_n, data = table_cluster_scaled, ellipse = T, palette = "Set2",
                            geom = "point") +
  theme_classic()
fig_cluster

df_cluster_info <- tibble(SAMPLE_ID = row.names(table_cluster_scaled),
                          cluster_id = k_n$cluster)
bangladesh_gw <- bangladesh_gw %>% 
  left_join(df_cluster_info)
## Joining with `by = join_by(SAMPLE_ID)`
bangladesh_gw <- bangladesh_gw %>% 
  mutate(cluster_id = factor(cluster_id))
bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  ggplot(aes(x = cluster_id, y = As_ugL, fill = cluster_id)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  
  scale_y_log10() +
  
  theme_classic() +
  theme(legend.position = "none")

bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  ggplot(aes(x = cluster_id, y = Fe_mgL, fill = cluster_id)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  
  scale_y_log10() +
  
  theme_classic() +
  theme(legend.position = "none")

bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  ggplot(aes(x = cluster_id, y = SO4_mgL, fill = cluster_id)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  
  scale_y_log10() +
  
  theme_classic() +
  theme(legend.position = "none")

bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  ggplot(aes(x = cluster_id, y = Si_mgL, fill = cluster_id)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  
  scale_y_log10() +
  
  theme_classic() +
  theme(legend.position = "none")

bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  ggplot(aes(x = cluster_id, y = WELL_DEPTH_m, fill = cluster_id)) +
  geom_boxplot() +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  
  scale_y_log10() +
  
  theme_classic() +
  theme(legend.position = "none")
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

summary(bangladesh_gw$cluster_id)
##    1    2    3    4    5    6 NA's 
##  356  599  731  639  811  393    5
sf_bangladesh_gw <- bangladesh_gw %>% 
  st_as_sf(coords = c("LONG_DEG", "LAT_DEG"))
tmap_mode("view")
## tmap mode set to interactive viewing
sf_bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  tm_shape() +
  tm_dots(col = "cluster_id", palette = "Set2",
          popup.vars = c("cluster_id", "WELL_DEPTH_m", "As_ugL")
          ) +

  tm_mouse_coordinates() +
  
  tm_basemap(server = c("Esri.WorldImagery", "OpenStreetMap", "Esri.WorldShadedRelief")) +
  
  tm_scale_bar() 
## Warning: Currect projection of shape . unknown. Long-lat (WGS84) is assumed.
sf_bangladesh_gw %>% 
  filter(!is.na(cluster_id)) %>% 
  
  tm_shape() +
  tm_dots(col = "As_ugL", palette = "viridis", breaks = c(0,5,10,50,100,200),
          popup.vars = c("cluster_id", "WELL_DEPTH_m", "As_ugL")
          ) +

  tm_mouse_coordinates() +
  
  tm_basemap(server = c("Esri.WorldImagery", "OpenStreetMap", "Esri.WorldShadedRelief")) +
  
  tm_scale_bar() + 
  tm_facets(by = "cluster_id", sync = T)
## Warning: Currect projection of shape . unknown. Long-lat (WGS84) is assumed.
## Warning: Values have found that are higher than the highest break